To identify different segments in the existing customer base, based on their spending patterns as well as their past interactions with the bank, using clustering algorithms, and to provide recommendations to the bank on how to better market to and service these customers.
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import pandas_profiling
import time
sns.set(color_codes=True)
%matplotlib inline
# to scale the data using z-score
from sklearn.preprocessing import StandardScaler
# to compute distances
from scipy.spatial.distance import cdist
# to perform k-means clustering and compute silhouette scores
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
# to visualize the elbow curve and silhouette scores
from yellowbrick.cluster import KElbowVisualizer, SilhouetteVisualizer
# to compute distances
from scipy.spatial.distance import pdist
# to perform hierarchical clustering, compute cophenetic correlation, and create dendrograms
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet
# read data from excel file
data = pd.read_excel('Credit Card Customer Data.xlsx')
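Note: recent pandas versions rely on the openpyxl package to read .xlsx files; a minimal sketch making that dependency explicit (assuming openpyxl is installed):
# sketch: pandas >= 1.2 uses openpyxl as the .xlsx engine
data = pd.read_excel('Credit Card Customer Data.xlsx', engine='openpyxl')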
# get columns
data.columns
Index(['Sl_No', 'Customer Key', 'Avg_Credit_Limit', 'Total_Credit_Cards',
'Total_visits_bank', 'Total_visits_online', 'Total_calls_made'],
dtype='object')
# get size of dataset
data.shape
(660, 7)
# check dataset information
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 660 entries, 0 to 659
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   Sl_No                660 non-null    int64
 1   Customer Key         660 non-null    int64
 2   Avg_Credit_Limit     660 non-null    int64
 3   Total_Credit_Cards   660 non-null    int64
 4   Total_visits_bank    660 non-null    int64
 5   Total_visits_online  660 non-null    int64
 6   Total_calls_made     660 non-null    int64
dtypes: int64(7)
memory usage: 36.2 KB
# check dataset missing values
total = data.isnull().sum().sort_values(ascending=False) # total number of null values
print(total)
Sl_No                  0
Customer Key           0
Avg_Credit_Limit       0
Total_Credit_Cards     0
Total_visits_bank      0
Total_visits_online    0
Total_calls_made       0
dtype: int64
# check for duplicates
data.duplicated().sum()
0
This first assessment of the dataset shows:
- 660 rows and 7 columns, all of type int64
- no missing values and no duplicated rows
- Sl_No and Customer Key are identifiers rather than behavioral features
# check first rows of data
data.head()
| | Sl_No | Customer Key | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 87073 | 100000 | 2 | 1 | 1 | 0 |
| 1 | 2 | 38414 | 50000 | 3 | 0 | 10 | 9 |
| 2 | 3 | 17341 | 50000 | 7 | 1 | 3 | 4 |
| 3 | 4 | 40496 | 30000 | 5 | 1 | 1 | 4 |
| 4 | 5 | 47437 | 100000 | 6 | 0 | 12 | 3 |
# check last rows of data
data.tail()
| | Sl_No | Customer Key | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made |
|---|---|---|---|---|---|---|---|
| 655 | 656 | 51108 | 99000 | 10 | 1 | 10 | 0 |
| 656 | 657 | 60732 | 84000 | 10 | 1 | 13 | 2 |
| 657 | 658 | 53834 | 145000 | 8 | 1 | 9 | 1 |
| 658 | 659 | 80655 | 172000 | 10 | 1 | 15 | 0 |
| 659 | 660 | 80150 | 167000 | 9 | 0 | 12 | 2 |
We can get a first statistical and descriptive analysis using pandas_profiling.
# get pandas profiling report
pandas_profiling.ProfileReport(data)
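Note: pandas_profiling has since been renamed ydata-profiling; a minimal sketch of the equivalent call under the new package name (assuming ydata-profiling is installed):
# sketch: pandas_profiling is now published as ydata-profiling
from ydata_profiling import ProfileReport
profile = ProfileReport(data, title='Credit Card Customer Data')  # title is an optional report label
profile.to_notebook_iframe()  # render the report inline in the notebook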
The pandas profiling report highlights some warnings/characteristics in the data: Customer Key contains duplicated values, Avg_Credit_Limit is strongly right-skewed, and several feature pairs are highly correlated. We verify these observations below.
# get stats for the columns
data.describe().T
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| Sl_No | 660.0 | 330.500000 | 190.669872 | 1.0 | 165.75 | 330.5 | 495.25 | 660.0 |
| Customer Key | 660.0 | 55141.443939 | 25627.772200 | 11265.0 | 33825.25 | 53874.5 | 77202.50 | 99843.0 |
| Avg_Credit_Limit | 660.0 | 34574.242424 | 37625.487804 | 3000.0 | 10000.00 | 18000.0 | 48000.00 | 200000.0 |
| Total_Credit_Cards | 660.0 | 4.706061 | 2.167835 | 1.0 | 3.00 | 5.0 | 6.00 | 10.0 |
| Total_visits_bank | 660.0 | 2.403030 | 1.631813 | 0.0 | 1.00 | 2.0 | 4.00 | 5.0 |
| Total_visits_online | 660.0 | 2.606061 | 2.935724 | 0.0 | 1.00 | 2.0 | 4.00 | 15.0 |
| Total_calls_made | 660.0 | 3.583333 | 2.865317 | 0.0 | 1.00 | 3.0 | 5.00 | 10.0 |
We are going to perform bivariate analysis to understand the relationships between the features.
# Continuous columns
con_col = ['Avg_Credit_Limit', 'Total_Credit_Cards','Total_visits_bank',
'Total_visits_online', 'Total_calls_made']
# Pairplot for continuous columns
sns.pairplot(data[con_col], diag_kind='kde');
# Get correlation matrix for numeric variables
data[con_col].corr()
| | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made |
|---|---|---|---|---|---|
| Avg_Credit_Limit | 1.000000 | 0.608860 | -0.100312 | 0.551385 | -0.414352 |
| Total_Credit_Cards | 0.608860 | 1.000000 | 0.315796 | 0.167758 | -0.651251 |
| Total_visits_bank | -0.100312 | 0.315796 | 1.000000 | -0.551861 | -0.506016 |
| Total_visits_online | 0.551385 | 0.167758 | -0.551861 | 1.000000 | 0.127299 |
| Total_calls_made | -0.414352 | -0.651251 | -0.506016 | 0.127299 | 1.000000 |
# Display correlation matrix in a heatmap
fig, ax = plt.subplots(figsize=(7,5))
sns.heatmap(data[con_col].corr(), annot=True, ax=ax);
Features with positive correlation:
- Avg_Credit_Limit and Total_Credit_Cards (0.61)
- Avg_Credit_Limit and Total_visits_online (0.55)
- Total_Credit_Cards and Total_visits_bank (0.32)

Features with negative correlation:
- Total_Credit_Cards and Total_calls_made (-0.65)
- Total_visits_bank and Total_visits_online (-0.55)
- Total_visits_bank and Total_calls_made (-0.51)
- Avg_Credit_Limit and Total_calls_made (-0.41)
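These pairs can also be ranked programmatically; a small sketch that lists each feature pair once, ordered by absolute correlation:
# sketch: rank feature pairs by absolute correlation
corr_pairs = data[con_col].corr().unstack()
# keep each unordered pair once (drops the diagonal and mirrored duplicates)
corr_pairs = corr_pairs[corr_pairs.index.get_level_values(0) < corr_pairs.index.get_level_values(1)]
print(corr_pairs.reindex(corr_pairs.abs().sort_values(ascending=False).index))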
# Drop Sl_No column (a serial number that carries no customer information)
data.drop(['Sl_No'], axis=1, inplace=True)
# Duplicated rows in Customer Key
data[data['Customer Key'].duplicated(keep=False)].sort_values(by=['Customer Key'])
| | Customer Key | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made |
|---|---|---|---|---|---|---|
| 48 | 37252 | 6000 | 4 | 0 | 2 | 8 |
| 432 | 37252 | 59000 | 6 | 2 | 1 | 2 |
| 4 | 47437 | 100000 | 6 | 0 | 12 | 3 |
| 332 | 47437 | 17000 | 7 | 3 | 1 | 0 |
| 411 | 50706 | 44000 | 4 | 5 | 0 | 2 |
| 541 | 50706 | 60000 | 7 | 5 | 2 | 2 |
| 391 | 96929 | 13000 | 4 | 5 | 0 | 0 |
| 398 | 96929 | 67000 | 6 | 2 | 2 | 2 |
| 104 | 97935 | 17000 | 2 | 1 | 2 | 10 |
| 632 | 97935 | 187000 | 7 | 1 | 7 | 0 |
# Drop Customer Key column (duplicated keys map to inconsistent rows, so it is not a reliable identifier)
data.drop(['Customer Key'], axis=1, inplace=True)
We are going to analyze whether there are outliers in the features.
# This function creates a figure with a boxplot and the distribution of the series
def histboxplot(feature):
    # creating the 2 subplots
    f2, (ax_box1, ax_hist1) = plt.subplots(nrows = 2, # Number of rows of the subplot grid = 2
                                           ncols = 1, # Number of columns of the subplot grid = 1
                                           sharex = 'col', # x-axis will be shared among columns
                                           figsize = (7,6),
                                           gridspec_kw = {"height_ratios": (.25, .75)});
    sns.boxplot(x=feature, ax=ax_box1, showmeans=True, color='violet'); # boxplot with the mean marked
    sns.distplot(feature, kde=True, ax=ax_hist1); # histogram with KDE overlay
    ax_hist1.axvline(np.mean(feature), color='green', linestyle='--'); # Add mean to the histogram
    ax_hist1.axvline(np.median(feature), color='black', linestyle='-'); # Add median to the histogram
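Note: sns.distplot is deprecated as of seaborn 0.11; a one-line sketch of the equivalent call with its replacement, sns.histplot:
# sketch: distplot replacement for seaborn >= 0.11
# sns.histplot(feature, kde=True, ax=ax_hist1)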
Avg_Credit_Limit
# display histbox
histboxplot(data['Avg_Credit_Limit'])
There are several points with an average credit limit above the upper whisker (100000). However, those values could correspond to premium customers, so we are not going to clip them.
Total_Credit_Cards
# display histbox
histboxplot(data['Total_Credit_Cards'])
There are no outliers.
Total_visits_bank
# display histbox
histboxplot(data['Total_visits_bank'])
There are no outliers.
Total_visits_online
# display histbox
histboxplot(data['Total_visits_online'])
There are some points above the upper whisker. However, they could correspond to customers who simply log in online more often, so we are not going to clip them.
Total_calls_made
# display histbox
histboxplot(data['Total_calls_made'])
There are no outliers.
# Scaling the data set before clustering
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)
# Creating a dataframe from the scaled data
data_scaled_df = pd.DataFrame(data_scaled, columns=data.columns)
data_scaled_df
| | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made |
|---|---|---|---|---|---|
| 0 | 1.740187 | -1.249225 | -0.860451 | -0.547490 | -1.251537 |
| 1 | 0.410293 | -0.787585 | -1.473731 | 2.520519 | 1.891859 |
| 2 | 0.410293 | 1.058973 | -0.860451 | 0.134290 | 0.145528 |
| 3 | -0.121665 | 0.135694 | -0.860451 | -0.547490 | 0.145528 |
| 4 | 1.740187 | 0.597334 | -1.473731 | 3.202298 | -0.203739 |
| ... | ... | ... | ... | ... | ... |
| 655 | 1.713589 | 2.443892 | -0.860451 | 2.520519 | -1.251537 |
| 656 | 1.314621 | 2.443892 | -0.860451 | 3.543188 | -0.553005 |
| 657 | 2.937092 | 1.520613 | -0.860451 | 2.179629 | -0.902271 |
| 658 | 3.655235 | 2.443892 | -0.860451 | 4.224968 | -1.251537 |
| 659 | 3.522245 | 1.982253 | -1.473731 | 3.202298 | -0.553005 |
660 rows × 5 columns
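As a quick sanity check (a sketch), every scaled feature should now have mean ≈ 0 and standard deviation ≈ 1:
# sketch: verify the z-score scaling
print(data_scaled_df.describe().loc[['mean', 'std']].round(2))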
We are going to use the Elbow Curve method to identify the optimal number of clusters.
# start time
start_time = time.time()
# set of different number of clusters (K)
clusters = range(1, 9)
# list with distortions
meanDistortions = []
# loop in clusters
for k in clusters:
    # create and fit KMeans model for k
    model = KMeans(n_clusters=k)
    model.fit(data_scaled_df)
    # predict cluster for each row
    prediction = model.predict(data_scaled_df)
    # calculate average distortion using euclidean distance
    distortion = (
        sum(np.min(cdist(data_scaled_df, model.cluster_centers_, "euclidean"), axis=1))
        / data_scaled_df.shape[0]
    )
    # append average distortion
    meanDistortions.append(distortion)
    print("Number of Clusters:", k, "\tAverage Distortion:", distortion)
# plot elbow curve
plt.plot(clusters, meanDistortions, "bx-")
plt.xlabel("k")
plt.ylabel("Average Distortion")
plt.title("Selecting k with the Elbow Method", fontsize=20)
print("\n\n--- Execution time %s seconds ---" % (time.time() - start_time))
Number of Clusters: 1 	Average Distortion: 2.0069222262503614
Number of Clusters: 2 	Average Distortion: 1.4571553548514269
Number of Clusters: 3 	Average Distortion: 1.1466276549150365
Number of Clusters: 4 	Average Distortion: 1.0463825294774465
Number of Clusters: 5 	Average Distortion: 0.9908683849620168
Number of Clusters: 6 	Average Distortion: 0.9426543606899347
Number of Clusters: 7 	Average Distortion: 0.9095950026993534
Number of Clusters: 8 	Average Distortion: 0.8902998903180782

--- Execution time 0.4954409599304199 seconds ---
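The KElbowVisualizer imported above can produce the same elbow plot with less code; a minimal sketch:
# sketch: elbow curve via yellowbrick's KElbowVisualizer
elbow = KElbowVisualizer(KMeans(random_state=1), k=(2, 9))
elbow.fit(data_scaled_df)  # fits KMeans for each k and records the distortion
elbow.show();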
Now, we are going to use the Silhouette Score to identify the optimal number of clusters.
# start time
start_time = time.time()
# set of different number of clusters (K)
clusters = range(2, 10)
# list with silhouette scores
sil_score = []
# loop in clusters
for k in clusters:
    # create and fit KMeans model for k, and predict the cluster for each row
    model = KMeans(n_clusters=k)
    preds = model.fit_predict(data_scaled_df)
    # calculate silhouette score
    score = silhouette_score(data_scaled_df, preds)
    sil_score.append(score)
    print("For n_clusters = {}, silhouette score is {}".format(k, score))
# plot silhouette_score
plt.plot(clusters, sil_score, "bx-")
plt.xlabel("k")
plt.ylabel("silhouette_score")
plt.title("Selecting k with Silhouette Score", fontsize=20)
print("\n\n--- Execution time %s seconds ---" % (time.time() - start_time))
For n_clusters = 2, silhouette score is 0.41842496663215445
For n_clusters = 3, silhouette score is 0.5157182558881063
For n_clusters = 4, silhouette score is 0.3556670619372605
For n_clusters = 5, silhouette score is 0.2726898791817692
For n_clusters = 6, silhouette score is 0.2553892519918698
For n_clusters = 7, silhouette score is 0.24865851948404674
For n_clusters = 8, silhouette score is 0.22708508639891067
For n_clusters = 9, silhouette score is 0.21144805721637253

--- Execution time 0.3671855926513672 seconds ---
# start time
start_time = time.time()
# Clusters with silhouette coefficients for K=3
visualizer = SilhouetteVisualizer(KMeans(3, random_state=1))
visualizer.fit(data_scaled_df)
visualizer.show();
print("\n\n--- Execution time %s seconds ---" % (time.time() - start_time))
--- Execution time 0.2004995346069336 seconds ---
K=3 is an appropriate number of clusters because it has the highest silhouette score and the elbow curve shows a clear kink at k=3.
# start time
start_time = time.time()
# create and fit Kmeans model
kmeans = KMeans(n_clusters=3, random_state=0)
kmeans.fit(data_scaled_df)
print("\n\n--- Execution time %s seconds ---" % (time.time() - start_time))
--- Execution time 0.01558995246887207 seconds ---
Average values for each feature
# adding kmeans cluster labels to the original dataframe
data['K_means_cluster'] = kmeans.labels_
# create cluster profile
cluster_profile_kmeans = data.groupby('K_means_cluster').mean()
# number of rows in each cluster
cluster_profile_kmeans['Count'] = (data.groupby('K_means_cluster')['Avg_Credit_Limit'].count().values)
# let's display cluster profiles
cluster_profile_kmeans.style.highlight_max(color="lightgreen", axis=0)
| K_means_cluster | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | Count |
|---|---|---|---|---|---|---|
| 0 | 33782.383420 | 5.515544 | 3.489637 | 0.981865 | 2.000000 | 386 |
| 1 | 12174.107143 | 2.410714 | 0.933036 | 3.553571 | 6.870536 | 224 |
| 2 | 141040.000000 | 8.740000 | 0.600000 | 10.900000 | 1.080000 | 50 |
Box plots for each feature
# Create subplots for the different boxplot charts
fig, axes = plt.subplots(1, 5, figsize=(16, 6))
for col in range(5):
    # select the specific feature in con_col
    feature = con_col[col]
    # display the boxplot of the feature per cluster
    sns.boxplot(x=data['K_means_cluster'], y=data[feature], ax=axes[col])
fig.tight_layout(pad=2.0)
Now, we are going to use the Hierarchical Clustering method to group the customers.
We are going to use the cophenetic correlation to identify the appropriate distance metric and linkage method.
# start time
start_time = time.time()
# list of distance metrics
distance_metrics = ['euclidean', 'chebyshev', 'mahalanobis', 'cityblock']
# list of linkage methods
linkage_methods = ['single', 'complete', 'average', 'centroid', 'ward', 'weighted']
# dataframe with cophenetic correlations
columns = ['Metric', 'Linkage Method', 'Cophenetic correlation']
cophenet_corr = pd.DataFrame(columns=columns)
# for each distance metric
for dm in distance_metrics:
    # for each linkage method
    for lm in linkage_methods:
        # centroid and ward methods require the distance metric to be Euclidean
        if (lm == 'centroid' or lm == 'ward') and dm != 'euclidean':
            continue
        # calculate cophenetic correlation
        Z = linkage(data_scaled_df, metric=dm, method=lm)
        c, coph_dists = cophenet(Z, pdist(data_scaled_df))
        # DataFrame.append was removed in pandas 2.0; pd.concat is the supported idiom
        cophenet_corr = pd.concat([cophenet_corr, pd.DataFrame([[dm, lm, c]], columns=columns)],
                                  ignore_index=True)
print("\n\n--- Execution time %s seconds ---" % (time.time() - start_time))
# sort results by Cophenetic correlation
cophenet_corr.sort_values(by='Cophenetic correlation',ascending=False)
--- Execution time 0.300518274307251 seconds ---
| | Metric | Linkage Method | Cophenetic correlation |
|---|---|---|---|
| 2 | euclidean | average | 0.897708 |
| 8 | chebyshev | average | 0.897416 |
| 16 | cityblock | average | 0.896329 |
| 3 | euclidean | centroid | 0.893939 |
| 9 | chebyshev | weighted | 0.891362 |
| 5 | euclidean | weighted | 0.886175 |
| 17 | cityblock | weighted | 0.882552 |
| 15 | cityblock | complete | 0.873148 |
| 1 | euclidean | complete | 0.859973 |
| 7 | chebyshev | complete | 0.853347 |
| 12 | mahalanobis | average | 0.832699 |
| 13 | mahalanobis | weighted | 0.780599 |
| 4 | euclidean | ward | 0.741516 |
| 0 | euclidean | single | 0.739122 |
| 6 | chebyshev | single | 0.738235 |
| 14 | cityblock | single | 0.725238 |
| 10 | mahalanobis | single | 0.705806 |
| 11 | mahalanobis | complete | 0.666353 |
Euclidean distance with average linkage gives the highest cophenetic correlation (0.898). Now, we are going to use dendrograms to identify the proper number of clusters.
# start time
start_time = time.time()
# list of linkage methods
linkage_methods = ["single", "complete", "average", "centroid", "ward", "weighted"]
# to create a subplot image
fig, axs = plt.subplots(len(linkage_methods), 1, figsize=(15, 30))
# We will enumerate through the list of linkage methods above
# For each linkage method, we will plot the dendrogram and calculate the cophenetic correlation
for i, method in enumerate(linkage_methods):
    Z = linkage(data_scaled_df, metric="euclidean", method=method)
    dendrogram(Z, ax=axs[i])
    axs[i].set_title(f"Dendrogram ({method.capitalize()} Linkage)")
    coph_corr, coph_dist = cophenet(Z, pdist(data_scaled_df))
    axs[i].annotate(
        f"Cophenetic\nCorrelation\n{coph_corr:0.2f}",
        (0.80, 0.80),
        xycoords="axes fraction",
    )
print("\n\n--- Execution time %s seconds ---" % (time.time() - start_time))
--- Execution time 133.59463214874268 seconds ---
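The chosen linkage tree can also be cut into flat clusters directly with scipy's fcluster; a minimal sketch, assuming the euclidean/average-linkage matrix computed above:
# sketch: cut the average-linkage tree into 3 flat clusters with scipy
from scipy.cluster.hierarchy import fcluster
Z = linkage(data_scaled_df, metric='euclidean', method='average')
flat_labels = fcluster(Z, t=3, criterion='maxclust')  # labels are numbered 1..3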
# start time
start_time = time.time()
HCmodel = AgglomerativeClustering(n_clusters=3, affinity="euclidean", linkage="average")
HCmodel.fit(data_scaled_df)
print("\n\n--- Execution time %s seconds ---" % (time.time() - start_time))
--- Execution time 0.009972810745239258 seconds ---
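Note: in scikit-learn 1.2 the affinity argument of AgglomerativeClustering was deprecated in favor of metric (and later removed); a sketch of the call for newer versions:
# sketch: for scikit-learn >= 1.2, pass metric instead of affinity
HCmodel = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='average')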
Average values for each feature
# adding hierarchical cluster labels to the original dataframe
data['Hierarchical_cluster'] = HCmodel.labels_
# create cluster profile (the K_means_cluster column is still in data, so it appears in the profile as well)
cluster_profile_hie = data.groupby('Hierarchical_cluster').mean()
# number of rows in each cluster
cluster_profile_hie['Count'] = (data.groupby('Hierarchical_cluster')['Avg_Credit_Limit'].count().values)
# let's display cluster profiles
cluster_profile_hie.style.highlight_max(color="lightgreen", axis=0)
| Hierarchical_cluster | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | K_means_cluster | Count |
|---|---|---|---|---|---|---|---|
| 0 | 33713.178295 | 5.511628 | 3.485788 | 0.984496 | 2.005168 | 0.002584 | 387 |
| 1 | 141040.000000 | 8.740000 | 0.600000 | 10.900000 | 1.080000 | 2.000000 | 50 |
| 2 | 12197.309417 | 2.403587 | 0.928251 | 3.560538 | 6.883408 | 1.000000 | 223 |
Box plots for each feature
# Create subplots for the different boxplot charts
fig, axes = plt.subplots(1, 5, figsize=(16, 6))
for col in range(5):
    # select the specific feature in con_col
    feature = con_col[col]
    # display the boxplot of the feature per cluster
    sns.boxplot(x=data['Hierarchical_cluster'], y=data[feature], ax=axes[col])
fig.tight_layout(pad=2.0)
Now, we are going to compare both techniques and the clusters found by each of them.
Which clustering technique took less time for execution? Fitting the final models was fast in both cases (about 0.016 s for K-means and 0.010 s for hierarchical clustering), but model selection was far cheaper for K-means: the elbow and silhouette analyses each ran in under a second, while rendering the six dendrograms took over 133 seconds.
Which clustering technique gave you more distinct clusters, or are they the same? Both techniques find essentially the same three clusters; as shown below, the two labelings disagree on a single customer.
Now, we are going to compare the clusters found in each technique
# Kmeans clustering
cluster_profile_kmeans.style.highlight_max(color="lightgreen", axis=0)
| K_means_cluster | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | Count |
|---|---|---|---|---|---|---|
| 0 | 33782.383420 | 5.515544 | 3.489637 | 0.981865 | 2.000000 | 386 |
| 1 | 12174.107143 | 2.410714 | 0.933036 | 3.553571 | 6.870536 | 224 |
| 2 | 141040.000000 | 8.740000 | 0.600000 | 10.900000 | 1.080000 | 50 |
# Hierarchical clustering
cluster_profile_hie.style.highlight_max(color="lightgreen", axis=0)
| Hierarchical_cluster | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | K_means_cluster | Count |
|---|---|---|---|---|---|---|---|
| 0 | 33713.178295 | 5.511628 | 3.485788 | 0.984496 | 2.005168 | 0.002584 | 387 |
| 1 | 141040.000000 | 8.740000 | 0.600000 | 10.900000 | 1.080000 | 2.000000 | 50 |
| 2 | 12197.309417 | 2.403587 | 0.928251 | 3.560538 | 6.883408 | 1.000000 | 223 |
Both techniques determine almost the same clusters.
# unique cluster labels from each technique
kmeans_clusters = data['K_means_cluster'].unique()
hie_clusters = data['Hierarchical_cluster'].unique()
# count how many customers fall into each (K-means, hierarchical) label combination
for kmeans_cluster in kmeans_clusters:
    for hierarchical_cluster in hie_clusters:
        ix = (data['K_means_cluster']==kmeans_cluster) & (data['Hierarchical_cluster']==hierarchical_cluster)
        if sum(ix) > 0:
            print(f'K_means cluster: {kmeans_cluster}, Hierarchical cluster: {hierarchical_cluster}, Count: {sum(ix)}')
K_means cluster: 0, Hierarchical cluster: 0, Count: 386
K_means cluster: 1, Hierarchical cluster: 0, Count: 1
K_means cluster: 1, Hierarchical cluster: 2, Count: 223
K_means cluster: 2, Hierarchical cluster: 1, Count: 50
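The same comparison can be produced in one line with pd.crosstab; a minimal sketch:
# sketch: contingency table of K-means labels vs. hierarchical labels
print(pd.crosstab(data['K_means_cluster'], data['Hierarchical_cluster']))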
# customer in kmeans cluster 1 and hierarchical cluster 0
ix = (data['K_means_cluster']==1) & (data['Hierarchical_cluster']==0)
data[ix]
| | Avg_Credit_Limit | Total_Credit_Cards | Total_visits_bank | Total_visits_online | Total_calls_made | K_means_cluster | Hierarchical_cluster |
|---|---|---|---|---|---|---|---|
| 313 | 7000 | 4 | 2 | 2 | 4 | 1 | 0 |
This customer seems more similar to hierarchical cluster 2 than to cluster 0. Therefore, K-means assigned this specific customer more appropriately.
Now, we are going to plot the different characteristics of the customers
# Create subplots for the different boxplot charts
fig, axes = plt.subplots(1, 5, figsize=(16, 6))
for col in range(5):
    # select the specific feature in con_col
    feature = con_col[col]
    # display the boxplot of the feature per cluster
    sns.boxplot(x=data['K_means_cluster'], y=data[feature], ax=axes[col])
fig.tight_layout(pad=2.0)
There are 3 different groups of customers with the following characteristics:
- Cluster 0 (386 customers): medium average credit limit (~34,000), 5-6 credit cards, prefer visiting the bank in person, and rarely use online banking or calls.
- Cluster 1 (224 customers): low average credit limit (~12,000), 2-3 credit cards, rarely visit the bank, and contact it mostly through calls and online visits.
- Cluster 2 (50 customers): very high average credit limit (~141,000), 8-9 credit cards, heavy online users who rarely visit or call the bank.
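For reporting, one might attach descriptive names to these segments; a sketch with hypothetical names derived from the profiles above:
# sketch: map cluster ids to descriptive segment names (names are illustrative)
segment_names = {0: 'Branch-first, mid-limit', 1: 'Low-limit, call/online support', 2: 'High-limit, digital-first'}
data['Segment'] = data['K_means_cluster'].map(segment_names)
print(data['Segment'].value_counts())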